This document contains the Traffic Signs Classifier lab, built on the LeNet architecture. The solution is implemented using the TensorFlow framework.
import matplotlib.pyplot as plt
import pandas as pd
from random import randint
import numpy as np
import pickle
import cv2
from sklearn.utils import shuffle
import tensorflow as tf
from tensorflow.contrib.layers import flatten
import os
import matplotlib.image as mpimg
# Class-id -> human-readable sign name mapping from the dataset metadata file.
# `values` yields rows of the form [id, name].
sign_names = pd.read_csv('./signnames.csv').values
def get_sign_name(id):
    """Return the human-readable name for a sign class id.

    Looks the id up in the module-level `sign_names` table loaded from
    signnames.csv. Returns None explicitly when the id is unknown
    (previously the function fell off the end, returning None implicitly).
    """
    for sign in sign_names:
        if sign[0] == id:
            return sign[1]
    return None
%matplotlib inline
def print_examples_from_group(images, ids, grayscale = False):
    """Plot a strip of random example images for every sign class.

    images: indexable collection of images (H x W x C arrays).
    ids: array of class labels aligned with `images`.
    grayscale: when True, render with the gray colormap.
    """
    line_length = 17  # number of random examples shown per class
    for sign in sign_names:
        sign_ids = np.where(ids == sign[0])[0]
        if sign_ids.size == 0:
            # Guard: randint(0, -1) would raise for a class with no examples.
            print("Id: {0}, name: {1}".format(sign[0], sign[1]))
            continue
        lane = plt.figure(figsize = (line_length, 1))
        lane.subplots_adjust(hspace = 0, wspace = 0)
        print("Id: {0}, name: {1}".format(sign[0], sign[1]))
        for i in range(line_length):
            # Sampling with replacement: the same image may appear twice.
            index = randint(0, sign_ids.size - 1)
            image = images[sign_ids[index]]
            a = lane.add_subplot(1, line_length, i + 1, xticks=[], yticks=[])
            if grayscale:
                a.imshow(image.squeeze(), cmap = "gray")
            else:
                a.imshow(image.squeeze())
    plt.show()
First of all lets load and preview the data.
def _unpickle(path):
    """Load one pickled dataset split from disk."""
    with open(path, mode='rb') as f:
        return pickle.load(f)

training_file = "train.p"
validation_file = "valid.p"
testing_file = "test.p"

train = _unpickle(training_file)
valid = _unpickle(validation_file)
test = _unpickle(testing_file)

# Each split is a dict with an image array and an aligned label array.
X_train_init, y_train = train['features'], train['labels']
X_valid_init, y_valid = valid['features'], valid['labels']
X_test_init, y_test = test['features'], test['labels']

# Sanity check: every image must have exactly one label.
assert(len(X_train_init) == len(y_train))
assert(len(X_valid_init) == len(y_valid))
assert(len(X_test_init) == len(y_test))

print()
print("Image Shape: {}".format(X_train_init[0].shape))
print()
print("Training Set: {} samples".format(len(X_train_init)))
print("Validation Set: {} samples".format(len(X_valid_init)))
print("Test Set: {} samples".format(len(X_test_init)))
Show the number of images for each traffic sign in the training set. It is important to understand whether there are enough training images for a sign in case of bad model performance.
# Count training examples per class. There are 43 sign classes, so
# np.bincount with minlength=43 gives exactly one integer count per class id.
# (The previous np.histogram(..., bins=42) both under-counted the classes and
# returned fractional bin *edges*, which never match integer class ids in
# get_sign_name and produce a tick/label length mismatch.)
class_counts = np.bincount(y_train, minlength=43)
plt.figure(figsize=(18, 15))
x_pos = np.arange(43)
plt.barh(x_pos, class_counts)
labs = [get_sign_name(j) for j in x_pos]
plt.yticks(x_pos, labs)
plt.show()
Show preview of the images
# Show a strip of random raw (unpreprocessed) training examples per sign class.
print_examples_from_group(X_train_init, y_train)
After previewing the images we can see that they were taken in different conditions (day, night, snow, with and without light reflections). On some images it is even hard to recognise the sign without preprocessing. If we train the model on these images without any preprocessing, the model gives us only around 60% accuracy. So, some preprocessing should be done for them. I tried several methods and came up with the following preprocessing strategy:
There is also space for image augmenting: image rotating and transformation, but it should be performed in some intelligent way and not for all images — what will improve one image can "kill" another. So, there should be some image preprocessing, for example, recognition of sign shape and transforming it to normal. We can also try to use information of where on the image is the sign and how far is it, so we could know what transformation should be done to make shape normal.
Also, cropping of the images to sign size could help the network to learn.
def grayscale_image(img):
    """Convert an RGB image to a single-channel luma image of shape (H, W, 1).

    Uses the Y (luma) channel of the YCrCb colour space, which preserves
    perceived brightness better than a plain per-channel average.
    """
    ycrcb = cv2.cvtColor(img, cv2.COLOR_RGB2YCrCb)
    # Slice with 0:1 to keep the channel axis. Unlike the previous
    # np.resize(..., (32, 32, 1)) this does not hard-code the 32x32 size,
    # so the helper works for any input resolution.
    return ycrcb[:, :, 0:1]
def equalise_histogram(img):
    """Stretch image contrast via histogram equalisation.

    Builds a 256-entry lookup table from the cumulative distribution of
    pixel intensities (zero-count levels are masked out so they do not
    distort the stretch) and maps every pixel through it. Expects uint8
    intensities in [0, 255]; returns a uint8 array of the same shape.
    """
    counts, _ = np.histogram(img.flatten(), 256, [0, 256])
    cumulative = counts.cumsum()
    # Mask levels that never occur so min()/max() reflect used levels only.
    occupied = np.ma.masked_equal(cumulative, 0)
    lo, hi = occupied.min(), occupied.max()
    stretched = (occupied - lo) * 255 / (hi - lo)
    lookup = np.ma.filled(stretched, 0).astype('uint8')
    return lookup[img]
def normalise_image(img):
    """Centre pixel values around zero.

    Divides by 256 (not 255), so uint8 input [0, 255] maps to
    [-0.5, 0.49609375] — approximately zero-mean for the network.
    """
    scaled = img / 256
    return scaled - 0.5
def preprocess_images(imgs):
    """Run every image through the grayscale -> equalise -> normalise pipeline.

    Returns a plain list of float arrays, one (H, W, 1) image per input.
    """
    return [
        normalise_image(equalise_histogram(grayscale_image(picture)))
        for picture in imgs
    ]
# Apply the full preprocessing pipeline to all three dataset splits.
X_train = preprocess_images(X_train_init)
X_valid = preprocess_images(X_valid_init)
X_test = preprocess_images(X_test_init)
Preview images after preprocessing
# Preview preprocessed examples. print_examples_from_group returns None,
# so the previous `res = ...` binding was dead and has been dropped.
print_examples_from_group(X_train, y_train, grayscale = True)
# Training hyperparameters: number of passes over the training set and
# the mini-batch size used for both training and evaluation.
EPOCHS = 50
BATCH_SIZE = 128
Implement neural network based on the LeNet-5 neural network architecture.
The LeNet architecture accepts a 32x32xC image as input, where C is the number of color channels (1 in my case because of grayscaling).
def LeNet(x):
    """Build a LeNet-style convolutional network over 32x32x1 inputs.

    Three convolutional layers (each followed by ReLU and 2x2 max pooling)
    feed three fully connected layers ending in 43 logits, one per
    traffic-sign class. Returns the un-normalised logits tensor.
    """
    # Arguments used for tf.truncated_normal: weights are drawn around `mu`
    # with spread `sigma`; biases start at zero.
    mu = 0
    sigma = 0.1
    # Layer 1: Convolutional. Input = 32x32x1. Output = 28x28x43 (5x5 kernel, VALID).
    conv1_W = tf.Variable(tf.truncated_normal(shape=(5, 5, 1, 43), mean = mu, stddev = sigma))
    conv1_b = tf.Variable(tf.zeros(43))
    conv1 = tf.nn.conv2d(x, conv1_W, strides=[1, 1, 1, 1], padding='VALID') + conv1_b
    # Activation.
    conv1 = tf.nn.relu(conv1)
    # Pooling. Input = 28x28x43. Output = 14x14x43.
    conv1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
    # Layer 2: Convolutional. Input = 14x14x43. Output = 10x10x80 (5x5 kernel, VALID).
    conv2_W = tf.Variable(tf.truncated_normal(shape=(5, 5, 43, 80), mean = mu, stddev = sigma))
    conv2_b = tf.Variable(tf.zeros(80))
    conv2 = tf.nn.conv2d(conv1, conv2_W, strides=[1, 1, 1, 1], padding='VALID') + conv2_b
    # Activation.
    conv2 = tf.nn.relu(conv2)
    # Pooling. Input = 10x10x80. Output = 5x5x80.
    conv2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
    # Layer 3: Convolutional. Input = 5x5x80. Output = 4x4x200 (2x2 kernel, VALID).
    conv3_W = tf.Variable(tf.truncated_normal(shape=(2, 2, 80, 200), mean = mu, stddev = sigma))
    conv3_b = tf.Variable(tf.zeros(200))
    conv3 = tf.nn.conv2d(conv2, conv3_W, strides=[1, 1, 1, 1], padding='VALID') + conv3_b
    # Activation.
    conv3 = tf.nn.relu(conv3)
    # Pooling. Input = 4x4x200. Output = 2x2x200.
    conv3 = tf.nn.max_pool(conv3, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')
    # Flatten. Input = 2x2x200. Output = 800.
    fc0 = flatten(conv3)
    # Layer 4: Fully Connected. Input = 800. Output = 120.
    fc1_W = tf.Variable(tf.truncated_normal(shape=(800, 120), mean = mu, stddev = sigma))
    fc1_b = tf.Variable(tf.zeros(120))
    fc1 = tf.matmul(fc0, fc1_W) + fc1_b
    # Activation.
    fc1 = tf.nn.relu(fc1)
    # Layer 5: Fully Connected. Input = 120. Output = 84.
    fc2_W = tf.Variable(tf.truncated_normal(shape=(120, 84), mean = mu, stddev = sigma))
    fc2_b = tf.Variable(tf.zeros(84))
    fc2 = tf.matmul(fc1, fc2_W) + fc2_b
    # Activation.
    fc2 = tf.nn.relu(fc2)
    # Layer 6: Fully Connected. Input = 84. Output = 43 (43 sign classes in the dataset).
    fc3_W = tf.Variable(tf.truncated_normal(shape=(84, 43), mean = mu, stddev = sigma))
    fc3_b = tf.Variable(tf.zeros(43))
    logits = tf.matmul(fc2, fc3_W) + fc3_b
    return logits
Train LeNet to classify traffic signs data.
x is a placeholder for a batch of input images.
y is a placeholder for a batch of output labels.
# x: batch of preprocessed 32x32x1 images; y: matching integer labels (0-42).
x = tf.placeholder(tf.float32, (None, 32, 32, 1))
# NOTE(review): (None) is just None (not a 1-tuple), so this placeholder's
# shape is fully unconstrained; (None,) would pin it to a vector — confirm intent.
y = tf.placeholder(tf.int32, (None))
# One-hot encode labels for the softmax cross-entropy loss below.
one_hot_y = tf.one_hot(y, 43)
Create a training pipeline that uses the model to classify data.
# Adam learning rate.
rate = 0.001
logits = LeNet(x)
# Per-example softmax cross-entropy between one-hot labels and logits,
# averaged into a single scalar loss.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(labels=one_hot_y, logits=logits)
loss_operation = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate = rate)
training_operation = optimizer.minimize(loss_operation)
Evaluate how well the loss and accuracy of the model for a given dataset.
# A prediction is correct when the arg-max logit matches the label's one-hot index.
correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(one_hot_y, 1))
accuracy_operation = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Saver used by the training loop below to checkpoint the best model so far.
saver = tf.train.Saver()
def evaluate(X_data, y_data):
    """Return the model's classification accuracy over a dataset.

    Runs `accuracy_operation` in the default session one mini-batch at a
    time and averages the per-batch accuracies weighted by batch size,
    so a short final batch does not skew the result.

    X_data: indexable collection of preprocessed images.
    y_data: aligned integer labels.
    """
    num_examples = len(X_data)
    if num_examples == 0:
        # Previously this divided by zero; an empty dataset now scores 0.0.
        return 0.0
    total_accuracy = 0
    sess = tf.get_default_session()
    for offset in range(0, num_examples, BATCH_SIZE):
        batch_x, batch_y = X_data[offset:offset+BATCH_SIZE], y_data[offset:offset+BATCH_SIZE]
        accuracy = sess.run(accuracy_operation, feed_dict={x: batch_x, y: batch_y})
        total_accuracy += (accuracy * len(batch_x))
    return total_accuracy / num_examples
Run the training data through the training pipeline to train the model.
Before each epoch, shuffle the training set.
After each epoch, measure the loss and accuracy of the validation set.
Save the model every time when accuracy is improved.
# Train the network: shuffle each epoch, step through mini-batches, validate,
# and checkpoint whenever validation accuracy improves.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    num_examples = len(X_train)
    print("Training...")
    print()
    max_val_accuracy = 0
    for epoch in range(EPOCHS):
        # Reshuffle so mini-batch composition differs between epochs.
        X_train, y_train = shuffle(X_train, y_train)
        for start in range(0, num_examples, BATCH_SIZE):
            stop = start + BATCH_SIZE
            sess.run(training_operation,
                     feed_dict={x: X_train[start:stop], y: y_train[start:stop]})
        validation_accuracy = evaluate(X_valid, y_valid)
        print("EPOCH {} ...".format(epoch + 1))
        print("Validation Accuracy = {:.3f}".format(validation_accuracy))
        print()
        # Only keep the checkpoint for the best validation score seen so far.
        if validation_accuracy > max_val_accuracy:
            saver.save(sess, './lenet')
            max_val_accuracy = validation_accuracy
Evaluate the performance of the model on the test set.
# Restore the best checkpoint saved during training and score the test set.
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('.'))
    test_accuracy = evaluate(X_test, y_test)
    print("Test Accuracy = {:.3f}".format(test_accuracy))
I was able to reach around 98% accuracy on the validation set. The model shows 95.6% accuracy on the test set. These are not the best results and certainly not safe for production use, so there is room for improvement (I would suggest stopping near 99.99%). Improvements could be made in 3 basic places:
# Score the model separately on each of the 43 sign classes to find
# which signs it struggles with.
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('.'))
    results = []
    for class_id in range(43):
        member_ids = np.where(y_test == class_id)[0]
        observed_images = [X_test[idx] for idx in member_ids]
        observed_labels = [class_id] * len(member_ids)
        test_accuracy = evaluate(observed_images, observed_labels)
        results.append(test_accuracy)
print("Evaluation per group")
plt.figure(figsize=(18, 15))
x_pos = range(43)
plt.barh(x_pos, results)
labs = [get_sign_name(j) for j in x_pos]
plt.yticks(x_pos, labs)
plt.show()
We see that some signs have bad recognition accuracy. For some of them this is dangerous, for example, "Pedestrians". Possible solutions to fix that:
One more interesting observation is that there is no correlation between amount of images in the train dataset and accuracy. This may be because some of images are just easier to detect and another are harder.
#printing out some stats and plotting
plt.close("all")
my_images_raw = []
my_labels = []
# Each file is named "<class_id>.<ext>", so the filename stem is the label.
for file in os.listdir("my_image_set/"):
    try:
        image = mpimg.imread('my_image_set/' + file)
        # mpimg may return floats in [0, 1]; rescale to uint8 like the pickled data.
        image = (image * 255).round().astype(np.uint8)
        my_images_raw.append(image)
        my_labels.append(int(file.split(".")[0]))
    except Exception:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit. Unreadable or misnamed files are
        # still skipped best-effort.
        print("Bad file " + file)
my_images = preprocess_images(my_images_raw)
def print_images(img, cmap = None):
    """Render a list of images side by side in a single horizontal strip.

    img: list of image arrays.
    cmap: optional matplotlib colormap name (e.g. "gray").
    """
    strip = plt.figure(figsize = (len(img), 1))
    strip.subplots_adjust(hspace = 0, wspace = 0)
    for position, picture in enumerate(img):
        axes = strip.add_subplot(1, len(img), position + 1, xticks=[], yticks=[])
        axes.imshow(picture.squeeze(), cmap = cmap)
    plt.show()
# Preview the hand-collected images before and after preprocessing.
print("My images test set: ")
print_images(my_images_raw, cmap = "gray")
print("Preprocessed: ")
print_images(my_images, cmap = "gray")
# Restore the best checkpoint and score the model on the hand-collected set.
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('.'))
    test_accuracy = evaluate(my_images, my_labels)
    print("My images recognition accuracy = {:.3f}".format(test_accuracy))
# Number of most-likely classes to show per image.
TOP_K = 5
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint('.'))
    # top_k over the softmax yields, per image, the TOP_K most likely
    # class probabilities (.values) and their class ids (.indices).
    my_images_softmax = sess.run(tf.nn.top_k(tf.nn.softmax(logits), k=TOP_K), feed_dict={x: my_images})
plt.figure(figsize=(18, 25))
# One grid row per image instead of the previous hard-coded 12, which broke
# for more than 12 images.
rows = len(my_images)
for i in range(len(my_images)):
    plt.subplot(rows, 2, 2*i + 1)
    image = my_images_raw[i]
    plt.imshow(image.squeeze(), cmap = "gray")
    plt.axis('off')
    plt.subplot(rows, 2, 2*i + 2)
    # Bar positions follow TOP_K instead of the previous fixed arange(1, 6).
    positions = np.arange(1, TOP_K + 1)
    plt.barh(positions, my_images_softmax.values[i, :])
    labs = [get_sign_name(j) for j in my_images_softmax.indices[i]]
    plt.yticks(positions, labs)
plt.show()
For my images, taken from a German driving video, the model does not perform very well. There are three signs with bad recognition results: